Setup

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

# Load libraries
library(caret)
library(tidyverse)
library(cowplot)
library(ggplot2)
library(ggpattern)
library(ggsci)
library(utils)
library(reshape2)
library(stringr)
library(gridGraphics)
library(zellkonverter)
library(SingleCellExperiment)
library(cytomapper)
library(scater)
library(ggpubr)
library(dittoSeq)
# Set general input paths to all analysis files
analysis.path <- "/mnt/bb_dqbm_volume/analysis"

# Set path to masks
tonsil.path <- "/mnt/bb_dqbm_volume/data/Tonsil_th152"
lung.path <- "/mnt/bb_dqbm_volume/data/Immucan_lung"
# Import helper fncs
source("../analysis/helpers/helper_fnc.R")

Read Inputs

first, we look at U computed using different training datasets. For this, we read in the results from CISI training on the Tonsil th152 and the Immucan lung dataset (tonsil_vs_lung.py).

## Read results
# Read in all results for tonsil and lung comparison into one dataframe
files <- list.files(analysis.path, 'simulation_results.txt',
                    full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
files <- files[grepl(
  "tonsil_vs_lung|Tonsil_th152/training/full|Immucan_lung/training/subset", files)]
res <- lapply(files, read_results, type="res") %>% 
  bind_rows() 

# Get information about best U files
u_files <- res %>% 
  dplyr::filter(dataset==training) %>% 
  dplyr::select(k, training, `Best crossvalidation fold`, datasize) %>% 
  distinct()

# Harmonize all dataset names
patterns <- unique(unlist(lapply(res$training, function(name){
  if(length(str_split(name, "_")[[1]])==1){
    name
  }
})))

res$training <- unlist(lapply(res$training, function(pat){
  patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))
res$dataset <- unlist(lapply(res$dataset, function(pat){
  patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))

## Read U
u <- lapply(1:(nrow(u_files)), function(i){
  file <- file.path(analysis.path, u_files[i, "training"], "training",
                    u_files[i, "datasize"], 
                    paste0("k_", u_files[i, "k"]),
                    paste0("crossvalidation_", u_files[i, "Best crossvalidation fold"]),
                    "gene_modules.csv")
  u_temp <- read_U(file, "training")
}) %>% bind_rows() %>%
  dplyr::rename(dataset=rep)

## Read X_test and X_simulated
# Specify X_test and X_simulated files to be read in
X.files <- list.files(analysis.path, "X_", full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
X.files <- X.files[grepl(
  "tonsil_vs_lung|Tonsil_th152/training/full|Immucan_lung/training/subset", X.files)]
X.files <- X.files[!grepl("_processed", X.files)]

# Read in all sce experiments from saved anndata and save additional info in metadata
# (e.g. which dataset is used in training and testing, which k was used and if it
# is the ground truth or simulated X)
# Remove neg. values and transform counts?
X.list <- lapply(X.files, read_results, type="x")
X.list <- lapply(X.list, function(sce.temp){
  pat <- metadata(sce.temp)
  metadata(sce.temp)$dataset <- patterns[str_detect(pat$dataset, 
                                                    fixed(patterns, ignore_case=TRUE))]
  metadata(sce.temp)$training <- patterns[str_detect(pat$training, 
                                                     fixed(patterns, ignore_case=TRUE))]
  assay.name <- names(assays(sce.temp))
  for (a in assay.name[-1]) {
    assay(sce.temp, a) <- NULL
  }
  assay(sce.temp, "exprs") <- asinh(counts(sce.temp)/1)
  
  sce.temp
})

# Add reduced dimensions
set.seed(220225)
X.list <- lapply(X.list, function(sce){
  sce <- runUMAP(sce, exprs_values="exprs") 
  sce <- runTSNE(sce, exprs_values="exprs")
  sce
})

# Save SCE to avoid computing UMAP, TSNE, ... all the time since it is computationally
# expensive
# lapply(1:(length(X.list)), function(i){saveRDS(X.list[i], 
#                                                paste0(gsub("\\..*", "", X.files[i]),
#                                                       "_processed.",
#                                                       gsub(".*\\.", "", basename(X.files[i]))))})

## Read masks
# Read in masks for tonsil data
masks.tonsil <- loadImages(file.path(tonsil.path, "steinbock/masks_deepcell"), as.is = TRUE)
mcols(masks.tonsil) <- DataFrame(sample_id=names(masks.tonsil))

# Read in masks for lung data
masks.lung <- loadImages(file.path(lung.path, "masks"), as.is = TRUE)
mcols(masks.lung) <- DataFrame(sample_id=names(masks.lung))

## Read random forest classifier
# Read RF classifier for the lung dataset
rffit.lung <- readRDS(file.path(lung.path, "rffit_v7_all_markers.rds"))

Results

# For each different measurement of training results, plot a barplot comparing
# diff. k-sparsities, simulation types and training-test set combinations
for (measure in names(res)[2:10]) {
  cat('##### ', measure, '\n')
  
  # Reorder dataframe for plotting
  data_temp <- res %>%
    dplyr::select(dataset, training, simulation, k, !!measure) %>%
    mutate(group=paste(training, dataset, sep="_"))
  
  # Create barplot
  barplot <- ggplot(data_temp, aes(x=group, y=!!sym(measure), fill=k, pattern=simulation)) +
    geom_bar_pattern(stat="identity",
                     position=position_dodge(preserve="single"),
                     width=0.6,
                     color="black", 
                     pattern_fill="black",
                     pattern_angle=45,
                     pattern_density=0.3,
                     pattern_spacing=0.01,
                     pattern_key_scale_factor=0.6) +
    scale_y_continuous(limits=c(0, 1)) +
    scale_fill_npg() +
    scale_pattern_manual(values=c(composite="stripe", noisy="none"), 
                         labels=c("Composite", "Noisy")) +
    labs(x="Training_Test Dataset", y=str_to_title(measure), pattern="Simulation Type") + 
    guides(pattern=guide_legend(override.aes=list(fill="white")),
           fill=guide_legend(override.aes=list(pattern="none"))) +
    theme_cowplot()
  
  print(barplot)
  cat("\n\n")
}
Overall pearson

Overall spearman

Gene average

Sample average

Sample dist pearson

Sample dist spearman

Gene dist pearson

Gene dist spearman

Matrix coherence (90th ptile)

Heatmaps of U

cor <- plot_U(u, "k", "dataset")
k = 1

k = 3

Module Comparisons

plot_U_membership(u, "k", "dataset")
k = 1

k = 3

#### Mantel Test between U’s

# Calculate mantel test between U's of tonsil and lung for all k's
mantel <- lapply(cor, function(l){
  mantel_test(l[[1]], l[[2]])$mantel_r
  }) %>%
  as.data.frame(col.names=names(cor), check.names=FALSE)

knitr::kable(mantel, digits=2,
             col.names=paste0("k = ", names(mantel)))
k = 1 k = 3
0.67 0.42

CD20 of Test Image (Tonsil th152, k=1)

# Subset to training and test of tonsil data
X.plotList <- keep(X.list, function(x){
  metadata(x)$training=="tonsil" & metadata(x)$dataset=="tonsil" & metadata(x)$k=="1"})
names(X.plotList) <- lapply(X.plotList, 
                            function(x){metadata(x)$ground_truth})
# Call plot_cells to get individual plots for each roi for decomposed and true
# results
X.imgList <- plot_cells(X.plotList, masks.tonsil, 
                        "CD20")
# Plot decomposed vs true results for test roi (002)
X.img <- plot_grid(plotlist=append(X.imgList[grepl("20220520_TsH_th152_cisi1_002",
                                                   names(X.imgList))],
                                   X.imgList[grepl("legend",
                                                   names(X.imgList))]),
                   ncol=2, labels=names(X.plotList),
                   label_size=15, hjust=c(-2, -1.5), 
                   vjust=1, scale=0.9)
X.img

CD20 of Test Image Overlayed (Tonsil th152, k=1)

# Rename list of tonsil data for nicer titles in plotting
X.plotListRenamed <- lapply(names(X.plotList), function(n){
  rownames(X.plotList[[n]]) <- paste(rownames(X.plotList[[n]]),
                                     n, sep="\n")
  reducedDims(X.plotList[[n]]) <- NULL
  X.plotList[[n]]
})
names(X.plotListRenamed) <- names(X.plotList)
# Add decomposed and true dataset of tonsil into one SCE
X.overlayed <- do.call("rbind", X.plotListRenamed)

# Define protein of interest
pois <- c("CD20\nsimulated", "CD20\nground_truth")

# Plot cells coloured according to decomposed and true poi values 
# (if similar in both should have a mixture of colours, e.g. orange) 
X.imgDiff <- plotCells(mask=masks.tonsil[unique(colData(X.overlayed)$sample_id)], 
                       object=X.overlayed,
                       cell_id="ObjectNumber", img_id="sample_id", colour_by=pois,
                       return_plot=TRUE,  image_title=list(cex=1),
                       scale_bar=list(cex=1, lwidth=5),
                       legend=list(colour_by.title.cex=0.7, colour_by.labels.cex=0.7))

ggdraw(X.imgDiff$plot)

Scatterplots of arcsinh Transformed Counts (Immucan Lung, k=1)

Plot of arcsinh transformed counts coloured by defined celltype.

# Subset to dataset trained and tested on the Immucan lung dataset
X.exprsList <- keep(X.list, function(x){
  metadata(x)$training=="lung" & metadata(x)$dataset=="lung" & metadata(x)$k=="1"})
names(X.exprsList) <- lapply(X.exprsList,
                             function(x){metadata(x)$ground_truth})

# Plot asinh transformed counts of proteins of interest of decomposed and true 
# datasets coloured by different celltypes
X.Exprs <- plot_grid(plot_exprs(X.exprsList, "B", "CD3", "CD20"),
                     plot_exprs(X.exprsList, "BnT", "CD3", "CD20"),
                     plot_exprs(X.exprsList, "Neutro", "MPO", "CD15"),
                     ncol=1)
print(X.Exprs)

Scatterplot Results with and without Normalization in Training

(Immucan Lung, k=1, arcsinh)

Next, we have a look of normalization in training.

# # TODO: Check that this works and figure out what exactly we want to look at here
# 
# # Subset to dataset trained and tested on the Immucan lung dataset with k=1
# # with and without normalization for the decomposed counts
# X.normList <- keep(X.list, function(x){
#   metadata(x)$training=="lung" & metadata(x)$dataset=="lung" &
#     (metadata(x)$k=="1" | metadata(x)$k=="norm") &
#     metadata(x)$ground_truth=="simulated"})
# names(X.normList) <- lapply(X.normList,
#                             function(x){metadata(x)$k})
# 
# # Plot asinh transformed counts of proteins of interest of decomposed data trained
# # with and without normalization
# X.Norms <- plot_grid(plot_exprs(X.exprsList, "", "CD3", "CD20"),
#                      plot_exprs(X.exprsList, "", "CD3", "VISTA"),
#                      ncol=1)
# print(X.Norms)

Results per Protein (k=1, arcsinh)

Scatterplot of ground truth vs decomposed results per protein for k=1 and asinh transformed counts.

# Add all X_test counts together into dataframe for easier plotting
aoi <- "exprs"
X.df <- lapply(X.list, function(sce){
  counts.long <- as.data.frame(assays(sce)[[aoi]]) %>%
    mutate(protein=rownames(.)) %>%
    melt() %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
                  cell=variable) %>%
    mutate(k=metadata(sce)$k,
           dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
  
}) %>% bind_rows() %>%
  group_by(protein, cell, k, dataset) %>%
  summarise_all(na.omit)

# For each test/training dataset combination plot for each true vs decomposed
# results in scatter plot and add diagonal and regression line, as well as
# R (pearson correlation coefficient)
for (i in unique(X.df$dataset)) {
  cat('#####', i, '\n')
  
  proteinPlot <- ggscatter(X.df %>%
                             dplyr::filter(k=="1" & dataset==i), 
                           x="ground_truth", y="simulated",
                           add="reg.line",
                           color=pal_npg("nrc")("1"),
                           add.params=list(color=pal_npg("nrc")("4")[4],
                                           size=2)) +
    facet_wrap(~ protein, scales="free", ncol=5) +
    theme_cowplot(10) +
    geom_abline(slope=1, linetype="dashed") +
    stat_cor(size=2)
  
  print(proteinPlot)
  cat('\n\n')
}
lung_lung

tonsil_lung

lung_tonsil

tonsil_tonsil

Correlations per Protein (k=1, arcsinh)

Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts.

# Calculate correlations between ground truth and simulated data for each protein
X.cor <- X.df %>%
  group_by(k, dataset, protein) %>%
  summarize(correlation=cor(ground_truth, simulated),
            median=median(ground_truth),
            max=max(ground_truth),
            var=var(ground_truth))

# Plot correlations for k=1 for each test/training dataset combination
for (i in unique(X.df$dataset)) {
  cat('#####', i, '\n')
  
  proteinCorr <- ggplot(X.cor %>% 
                          dplyr::filter(k=="1" & dataset==i),
                        aes(x=protein, y=correlation, fill=protein)) +
    geom_bar(stat="identity") +
    theme_cowplot() +
    scale_fill_igv() +
    guides(fill=FALSE) +
    coord_flip()
  
  print(proteinCorr)
  cat('\n\n')
}
lung_lung

tonsil_lung

lung_tonsil

tonsil_tonsil

Correlations per Protein vs Protein Characteristics (k=1, arcsinh)

Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts compared to protein characteristics of gorund truth: Median, max and variance.

# Define charachteristics of interest
coi <- c("median", "max", "var")

# Plot correlations for k=1 for each test/training dataset combination
for (i in coi) {
  cat('#####', i, '\n')
  
  proteinChar <- ggscatter(X.cor %>%
                             dplyr::filter(k=="1"), 
                           x="correlation", y=i,
                           add="reg.line",
                           color=pal_npg("nrc")("1"),
                           add.params=list(color=pal_npg("nrc")("4")[4],
                                           size=2)) +
    facet_wrap(~ dataset, scales="free", ncol=2) +
    theme_cowplot(10) +
    stat_cor(size=2)
  
  print(proteinChar)
  cat('\n\n')
}
median

max

var

Reduced Dimension Plots

# Calculate correlations between all ground truth cells and for each cell retain
# its max correlation to another cell
# If max >= 0.995, then the cell is marked as having a highly correlated cell in
# ground truth
sce.maxCor <- counts(X.exprsList[[2]]) %>%
  cor %>%
  as.data.frame %>%
  rownames_to_column(var='cell_i') %>%
  gather(key="cell_j", value="cor", -cell_i) %>%
  dplyr::filter(!(cell_i==cell_j)) %>%
  group_by(cell_i) %>%
  summarise(max_cor=max(cor, na.rm=TRUE)) %>%
  dplyr::filter(max_cor>=0.995)

# Add above information to sce experiments
colData(X.exprsList[[1]])$high_cor <- ifelse(
  colData(X.exprsList[[1]])$cell_id %in% sce.maxCor$cell_i, 
  "high", 
  "low")
colData(X.exprsList[[2]])$high_cor <- colData(X.exprsList[[1]])$high_cor

# Create empty list for plots
X.redDimlist <- list
# Extract for simulated and true results for UMAP and TSNE and plot and
# colour highly correlated ground truth cells
for(sce in X.exprsList){
  for(method in c("UMAP", "TSNE")){
    X.redDimlist <- append(X.redDimlist, 
                           list(dittoDimPlot(sce, var="high_cor", 
                                             reduction.use=method, size=0.2) +
                                  scale_color_manual(values=c("high"="red", "low"="grey"),
                                                     name="Highly Correlated to Another Cell",
                                                     labels=c("high"="true", "low"="false")) +
                                  theme_cowplot() +
                                  theme(legend.position="bottom") +
                                  guides(color=guide_legend(nrow=1)))) 
  }
}
# Remove empty plot at the beginning and add legend to the end of the plot list
X.redDimlist <- c(lapply(X.redDimlist[-1], function(p){
  p + theme(legend.position="none")
}), list(get_legend(X.redDimlist[2])))

# Plot simulated and true reduced dim on top of each other
X.redDimPlot <- plot_grid(plotlist=X.redDimlist,
                          ncol=2, labels=rep(names(X.exprsList), each=2),
                          label_size=15, hjust=c(-2, -1.5), vjust=1, scale=0.9)

print(X.redDimPlot)

#### Cell Phenotyping

# # Subset to dataset trained and tested on the Immucan lung dataset
# lungSim.list <- keep(X.list, function(x){
#   metadata(x)$dataset=="lung" & metadata(x)$k=="1" & metadata(x)$ground_truth=="simulated"})
# names(lungSim.list) <- lapply(lungSim.list,
#                               function(x){metadata(x)$training})
# lungSim.exprs <- lapply(lungSim.list, function(sce){
#   cur_mat <- t(assay(sce, "exprs"))
#   # Predict cell phenotypes in test data
#   cur_pred <- predict(rffit.lung,
#                       newdata=cur_mat)
#   cur_pred
# })
# 
# cm <- confusionMatrix(data=cur_pred,
#                       reference=factor(lungSim.list$cell_labels),
#                       mode="everything")
# 
# cm